// Copyright (C) 2024 Kumo inc.
// Author: Jeff.li lijippy@163.com
// All rights reserved.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.
//

#pragma once

#include "llama.h"

#include <unordered_map>
#include <string>
#include <vector>

#define LLAMA_NGRAM_MIN    1
#define LLAMA_NGRAM_MAX    4
#define LLAMA_NGRAM_STATIC 2

// Data structures to map n-grams to empirical token probabilities:

namespace kllm {
    struct common_ngram {
        llama_token tokens[LLAMA_NGRAM_MAX];

        common_ngram() {
            for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
                tokens[i] = -1;
            }
        }

        common_ngram(const llama_token *input, const int ngram_size) {
            for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
                tokens[i] = i < ngram_size ? input[i] : -1;
            }
        }

        bool operator==(const common_ngram &other) const {
            for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
                if (tokens[i] != other.tokens[i]) {
                    return false;
                }
            }
            return true;
        }
    };

    struct common_token_hash_function {
        size_t operator()(const llama_token token) const {
            // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
            return token * 11400714819323198485llu;
        }
    };

    struct common_ngram_hash_function {
        size_t operator()(const common_ngram &ngram) const {
            size_t hash = common_token_hash_function{}(ngram.tokens[0]);
            for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
                hash ^= common_token_hash_function{}(ngram.tokens[i]);
            }
            return hash;
        }
    };

    // token -> number of times token has been seen
    typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;

    // n-gram -> empirical distribution of following tokens
    typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;


    // Update an ngram cache with tokens.
    // ngram_cache:         the cache to modify.
    // ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
    // inp_data:            the token sequence with which to update ngram_cache.
    // nnew:                how many new tokens have been appended to inp_data since the last call to this function.
    // print_progress:      whether to print progress to stderr.
    //
    // In order to get correct results inp_data can ONLY BE APPENDED TO.
    // Changes in the middle need a complete rebuild.
    void common_ngram_cache_update(
            common_ngram_cache &ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> &inp_data, int nnew,
            bool print_progress);

    // Try to draft tokens from ngram caches.
    // inp:                the tokens generated so far.
    // draft:              the token sequence to draft. Expected to initially contain the previously sampled token.
    // n_draft:            maximum number of tokens to add to draft.
    // ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
    // nc_context:         ngram cache based on current context.
    // nc_dynamic:         ngram cache based on previous user generations.
    // nc_static:          ngram cache generated from a large text corpus, used for validation.
    void common_ngram_cache_draft(
            std::vector<llama_token> &inp, std::vector<llama_token> &draft, int n_draft, int ngram_min, int ngram_max,
            common_ngram_cache &nc_context, common_ngram_cache &nc_dynamic, common_ngram_cache &nc_static);

    // Save an ngram cache to a file.
    // ngram_cache: the ngram cache to save.
    // filename:    the path under which to save the ngram cache.
    void common_ngram_cache_save(common_ngram_cache &ngram_cache, std::string &filename);

    // Load an ngram cache saved with common_ngram_cache_save.
    // filename: the path from which to load the ngram cache.
    // returns:  an ngram cache containing the information saved to filename.
    common_ngram_cache common_ngram_cache_load(std::string &filename);

    // Merge two ngram caches.
    // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
    // ngram_cache_add:    the ngram cache to add to ngram_cache_target.
    void common_ngram_cache_merge(common_ngram_cache &ngram_cache_target, common_ngram_cache &ngram_cache_add);
}  // namespace kllm